0: Preparation

Defining the input and output files

# Clean the working environment
rm(list = ls())

knitr::opts_chunk$set(echo = TRUE)

empirical_species <- "German Shepherd"
document_title <- paste(empirical_species," Pipeline Results")

MAF_pruning_used <- TRUE

N_e <- 50
# document_sub <- paste("MAF 0.05 used, but only for H_E-computation, N_e=", N_e)




if (MAF_pruning_used == FALSE) {
  document_sub <- paste("No MAF-based pruning used, N_e =", N_e)


} else {
  document_sub <- paste("MAF-based pruning used, N_e =", N_e)
}

############################################ 
# Parameters used for displaying the result
############################################ 
ROH_frequency_decimals <- 1
H_e_values_decimals <- 5
F_ROH_values_decimals <- 3





####################################  
# Defining the input files
#################################### 

Selection_strength_test_dir <- Sys.getenv("Selection_strength_test_dir")

Sweep_test_dir <-  Sys.getenv("Sweep_test_dir")

############### 
## Empirical ###
###############

### ROH hotspots ###
Empirical_data_ROH_hotspots_dir  <- Sys.getenv("Empirical_data_ROH_hotspots_dir")
Empirical_data_autosome_ROH_freq_dir <- Sys.getenv("Empirical_data_autosome_ROH_freq_dir")
### Inbreeding coefficient ###

Empirical_data_F_ROH_dir  <- Sys.getenv("Empirical_data_F_ROH_dir")

### Expected Heterozygosity distribution ###
Empirical_data_H_e_dir <- Sys.getenv("Empirical_data_H_e_dir")

############### 
## Simulated ###
###############

### Causative variant ###
variant_freq_plots_dir <- Sys.getenv("variant_freq_plots_dir")
variant_position_dir <- Sys.getenv("variant_position_dir")
pruned_replicates_count_dir <- Sys.getenv("pruned_replicates_count_dir")

Selection_causative_variant_windows_dir <- Sys.getenv("Selection_causative_variant_windows_dir")
causative_variant_windows_H_e_dir <- Sys.getenv("causative_variant_windows_H_e_dir")

### ROH hotspots ###
Neutral_model_ROH_hotspots_dir <- Sys.getenv("Neutral_model_ROH_hotspots_dir")
Neutral_model_autosome_ROH_freq_dir <- Sys.getenv("Neutral_model_autosome_ROH_freq_dir")

Selection_model_ROH_hotspots_dir  <- Sys.getenv("Selection_model_ROH_hotspots_dir")
Selection_model_autosome_ROH_freq_dir <- Sys.getenv("Selection_model_autosome_ROH_freq_dir")

### Inbreeding coefficient ###
Neutral_model_F_ROH_dir  <- Sys.getenv("Neutral_model_F_ROH_dir")
Selection_model_F_ROH_dir  <- Sys.getenv("Selection_model_F_ROH_dir")

### Expected Heterozygosity distribution ###
Neutral_model_H_e_dir <- Sys.getenv("Neutral_model_H_e_dir")
Selection_model_H_e_dir <- Sys.getenv("Selection_model_H_e_dir")


histogram_line_sizes <- 3
empirical_data_color <- "darkgreen"
neutral_model_color <- "blue" 
selection_model_color <- "purple"

output_dir <- Sys.getenv("Pipeline_results_output_dir") 


# Sys.getenv()  

# # Set the working directory for notebook chunks
# knitr::opts_knit$set(root.dir = output_dir_sweep_test)

Loading libraries

library(knitr)
## Warning: package 'knitr' was built under R version 4.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1

Causative variant

Variant fixation

Fixation probability

Fixation time

# Function to determine the number of rows until fixation is reached
time_to_fixation <- function(data, threshold = 0.9) {
  # Find the index of the first row where the allele frequency is 0.9 or higher
  fixation_index <- which(data$allele_frequency >= threshold)[1]
  
  # Return the number of rows until fixation is reached
  return(fixation_index - 1)
}

Summary - Variant fixation

Selection_coefficient Fixation_probability Avg_Fixation_time Min_fixation_time Max_fixation_time
s=0.05 0.3 34.15 25 39
s=0.1 1.5 33.70 25 39
s=0.2 16.0 32.65 24 39
s=0.3 27.4 26.35 17 39
s=0.4 41.7 22.50 12 35
s=0.5 54.1 16.15 8 23
s=0.6 54.1 12.15 8 16
s=0.7 41.7 9.10 7 13
s=0.8 52.6 8.05 6 12

Causative variant windows

Variant position

Window lengths

Causative variant windows

Summary - Causative variant windows

Selection_coefficient Avg_Length_Mb Avg_window_freq Min_freq Max_freq Avg_freq_variant_100k_window
4 s=0.3 4.592906 10.7 0 68 10.8
2 s=0.1 5.093953 38.6 0 84 39.6
1 s=0.05 5.763953 29.8 0 82 29.7
6 s=0.5 5.793953 4.4 0 38 3.6
5 s=0.4 6.965000 8.3 0 66 7.6
3 s=0.2 7.050000 24.0 0 80 23.8
7 s=0.6 7.953953 9.1 0 68 8.7
8 s=0.7 9.608953 7.9 0 50 7.5
9 s=0.8 10.898953 6.5 0 52 5.8

Standard Error Confidence interval function

Confidence level <=> konfidensgrad

# Function to calculate standard error and its confidence interval
standard_error_confidence_interval_fun <- function(observed_data, confidence_level = 0.95) {
  
  # Calculate standard error
  n <- length(observed_data)
  standard_deviation <- sd(observed_data)
  standard_error <- standard_deviation / sqrt(n - 1)
  
  # Calculate confidence interval based on standard error

  alpha <- (1 - confidence_level) / 2
  margin_of_error <- qnorm(1 - alpha) * standard_error
  mean_estimate <- mean(observed_data)
  # Calculate the percentiles for the confidence interval
  confidence_interval_lower_bound <- mean_estimate - margin_of_error # 2.5th percentile (2σ)
  confidence_interval_upper_bound <- mean_estimate + margin_of_error # 97.5th percentile (2σ)
  
  # Return confidence interval
  return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))
}





# # Function to calculate bootstrap confidence intervals
# standard_error_confidence_interval_fun <- function(observed_data, n_bootstraps = 1000, confidence_level = 0.95) {
#   
#   # Function to calculate the mean for each bootstrap sample
#   compute_bootstrap_mean_fun <- function(observed_data_urn) {
#     bootstrap_dataset <- sample(observed_data_urn, replace = TRUE)
#     bootstrap_estimate <- mean(bootstrap_dataset)
#     return(bootstrap_estimate)
#   }
#   
#   # Perform bootstrap resampling
#   bootstrap_point_estimates <- replicate(n_bootstraps, compute_bootstrap_mean_fun(observed_data))
#   
#   # Calculate the percentiles for the confidence interval
#   alpha <- (1 - confidence_level) / 2
#   confidence_interval_lower_bound <- quantile(bootstrap_point_estimates, alpha) # 2.5th percentile
#   confidence_interval_upper_bound <- quantile(bootstrap_point_estimates, 1 - alpha) # 97.5th percentile
#   
#   # Return the confidence interval
#   return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))
# }

1: ROH-Frequency

1.1 : Autosome ROH-frequencies

1.1.1 : Empirical - ROH frequency

1.1.2: Neutral Model - ROH frequency

1.1.3: Selection Model

1.1.4 Summary - ROH-hotspot threshold

## ROH-hotspot selection testing results://n
Model Avg_ROH_hotspot_threshold
Neutral 28.7
s=0.05 33.2
s=0.1 35.0
s=0.5 35.1
s=0.4 35.2
s=0.2 35.8
s=0.3 36.8
s=0.7 37.6
s=0.6 38.5
s=0.8 39.4
Empirical 75.0

1.2 ROH-hotspots - ROH Frequency and Length

2: Inbreeding coefficient

2.1 Empirical data

## Overall Average Avg_F_ROH for  german_shepherd : 0.2753012 //n

2.2 Neutral Model

## Point estimate F_ROH across all 20 simulations: 0.2382806 //n
## [1] "Bootstrap 95% Confidence Interval: [0.225944707258446, 0.250616452741554]"

2.3 Selection Model

## Point estimate F_ROH across all 20 simulations for  selection_model_s005_chr3 : 0.2730208 //n[1] "Bootstrap 95% Confidence Interval: [0.257821701861417, 0.288219838138583]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s01_chr3 : 0.2817853 //n[1] "Bootstrap 95% Confidence Interval: [0.262926084924311, 0.300644575075689]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s02_chr3 : 0.2916789 //n[1] "Bootstrap 95% Confidence Interval: [0.27582288465812, 0.30753487534188]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s03_chr3 : 0.2949622 //n[1] "Bootstrap 95% Confidence Interval: [0.275270344867574, 0.314654015132426]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s04_chr3 : 0.2765845 //n[1] "Bootstrap 95% Confidence Interval: [0.257329413481911, 0.295839626518089]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s05_chr3 : 0.2757632 //n[1] "Bootstrap 95% Confidence Interval: [0.263069394392445, 0.288457025607555]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s06_chr3 : 0.2953692 //n[1] "Bootstrap 95% Confidence Interval: [0.280072875313516, 0.310665584686484]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s07_chr3 : 0.2830673 //n[1] "Bootstrap 95% Confidence Interval: [0.261165531872551, 0.304969108127449]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s08_chr3 : 0.2847513 //n[1] "Bootstrap 95% Confidence Interval: [0.250907729600213, 0.318594910399787]"

2.4 F_ROH summary

Model F_ROH Lower_CI Upper_CI
Neutral 0.238 0.226 0.251
s=0.05 0.273 0.258 0.288
Empirical 0.275 NA NA
s=0.5 0.276 0.263 0.288
s=0.4 0.277 0.257 0.296
s=0.1 0.282 0.263 0.301
s=0.7 0.283 0.261 0.305
s=0.8 0.285 0.251 0.319
s=0.2 0.292 0.276 0.308
s=0.3 0.295 0.275 0.315
s=0.6 0.295 0.280 0.311

2.4.1 Visualizaing F_ROH

## Models where empirical F_ROH is within CI:
## [1] "s=0.05" "s=0.1"  "s=0.3"  "s=0.4"  "s=0.5"  "s=0.7"  "s=0.8"
## 
## Models where empirical F_ROH is outside CI:
## [1] "s=0.2"   "s=0.6"   "Neutral"

3: Expected Heterozygosity

3.1 Empirical data

3.2 Neutral Model

3.3 Selection Model

## Uncommented because change of analysis

3.4 Causative Variant Window

## Point estimate H_e across all 20 simulations for  s005_chr3 : 0.3136506 //n[1] "Bootstrap 95% Confidence Interval: [0.274729948288615, 0.35257127281979]"
## Point estimate H_e across all 20 simulations for  s01_chr3 : 0.2437236 //n[1] "Bootstrap 95% Confidence Interval: [0.208784537367758, 0.278662647234207]"
## Point estimate H_e across all 20 simulations for  s02_chr3 : 0.287157 //n[1] "Bootstrap 95% Confidence Interval: [0.250885743940252, 0.323428304819524]"
## Point estimate H_e across all 20 simulations for  s03_chr3 : 0.2640354 //n[1] "Bootstrap 95% Confidence Interval: [0.213138129976104, 0.314932680147393]"
## Point estimate H_e across all 20 simulations for  s04_chr3 : 0.3228381 //n[1] "Bootstrap 95% Confidence Interval: [0.289478390746543, 0.356197709502665]"
## Point estimate H_e across all 20 simulations for  s05_chr3 : 0.3361865 //n[1] "Bootstrap 95% Confidence Interval: [0.292957745879232, 0.379415174481695]"
## Point estimate H_e across all 20 simulations for  s06_chr3 : 0.3358466 //n[1] "Bootstrap 95% Confidence Interval: [0.288658400834631, 0.383034885590995]"
## Point estimate H_e across all 20 simulations for  s07_chr3 : 0.3380687 //n[1] "Bootstrap 95% Confidence Interval: [0.28983399187073, 0.386303426756603]"
## Point estimate H_e across all 20 simulations for  s08_chr3 : 0.3360891 //n[1] "Bootstrap 95% Confidence Interval: [0.292090306442711, 0.380087917137017]"

3.4 Genomewide 5th percentile comparison - Expected Heterozygosity Summary

Model H_e_5th_percentile Lower_CI Upper_CI
s01_chr3 0.15285 0.14262 0.16309
s08_chr3 0.15927 0.14455 0.17400
s02_chr3 0.15996 0.15177 0.16815
s07_chr3 0.16193 0.15085 0.17300
s03_chr3 0.16287 0.15123 0.17452
s04_chr3 0.16637 0.15725 0.17549
Neutral 0.16683 0.16108 0.17258
s005_chr3 0.16904 0.16161 0.17647
s05_chr3 0.17357 0.16482 0.18232
s06_chr3 0.17614 0.16712 0.18516
Empirical NA NA NA

4: Summary

4.0 General comparison

4.0.1 ROH-hotspot threshold comparison

## 
##  ROH-hotspot threshold comparison between the different datasets
Model Avg_ROH_hotspot_threshold
Neutral 28.7
s=0.05 33.2
s=0.1 35.0
s=0.5 35.1
s=0.4 35.2
s=0.2 35.8
s=0.3 36.8
s=0.7 37.6
s=0.6 38.5
s=0.8 39.4
Empirical 75.0

4.0.2 F_ROH comparison

Model F_ROH Lower_CI Upper_CI
Neutral 0.238 0.226 0.251
s=0.05 0.273 0.258 0.288
Empirical 0.275 NA NA
s=0.5 0.276 0.263 0.288
s=0.4 0.277 0.257 0.296
s=0.1 0.282 0.263 0.301
s=0.7 0.283 0.261 0.305
s=0.8 0.285 0.251 0.319
s=0.2 0.292 0.276 0.308
s=0.3 0.295 0.275 0.315
s=0.6 0.295 0.280 0.311

4.1: Hotspot comparison

4.1.1: Selection test (Sweep test)

## [1] "Selection test results"
## [1] "ROH-hotspot windows with an mean H_e Value lower or equal to the lower confidence interval of the fifth percentile of the neutral model are classified as being under selection"
## [1] "5th percentile of the neutral model is: 0.1610771"
Name Under_selection Window_based_Average_H_e
Hotspot_chr3_window_1 Yes 0.10706
Hotspot_chr3_window_3 Yes 0.12371
Hotspot_chr3_window_2 Yes 0.14453
Hotspot_chr17_window_2 Yes 0.14866
Hotspot_chr17_window_1 Yes 0.15285
Hotspot_chr19_window_1 No 0.16270
## [1] "ROH-hotspots under selection:"
Name Under_selection Window_based_Average_H_e
Hotspot_chr3_window_1 Yes 0.10706
Hotspot_chr3_window_3 Yes 0.12371
Hotspot_chr3_window_2 Yes 0.14453
Hotspot_chr17_window_2 Yes 0.14866
Hotspot_chr17_window_1 Yes 0.15285

4.1.2: Selection Strength Test (Sweep test)

4.1.1.1 Extracting Causative windows under selection

Model H_e Lower_CI Upper_CI Under_Selection
Neutral 0.16683 0.16108 0.17258 No
s=0.1 0.24372 0.20878 0.27866 No
s=0.3 0.26404 0.21314 0.31493 No
s=0.2 0.28716 0.25089 0.32343 No
s=0.05 0.31365 0.27473 0.35257 No
s=0.4 0.32284 0.28948 0.35620 No
s=0.6 0.33585 0.28866 0.38303 No
s=0.8 0.33609 0.29209 0.38009 No
s=0.5 0.33619 0.29296 0.37942 No
s=0.7 0.33807 0.28983 0.38630 No

4.1.1.1 Extracting Hotspots under selection

*** Behöver bootstrap av 5th percentiles från neutral model

Hotspot - Causative Window - Comparison

Model Lengths_Mb Avg_ROH_Freq
Hotspot_chr3_window_1 10.900000 81.3
s=0.8 10.898953 6.5
s=0.7 9.608953 7.9
s=0.6 7.953953 9.1
s=0.2 7.050000 24.0
s=0.4 6.965000 8.3
s=0.5 5.793953 4.4
s=0.05 5.763953 29.8
s=0.1 5.093953 38.6
s=0.3 4.592906 10.7
Hotspot_chr3_window_2 3.200000 76.3
Hotspot_chr3_window_3 2.700000 77.6
Hotspot_chr17_window_1 2.000000 76.4
Hotspot_chr17_window_2 0.600000 76.1

Visualizing and scaling

5 Visualizing Expected Heterozygoisty

5.1 Bootstrap CI for mean 5th percentile H_e

## Models where empirical H_e is within CI for Hotspot_chr3_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr3_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr3_window_3 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr3_window_3 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr3_window_2 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr3_window_2 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr17_window_2 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr17_window_2 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr17_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr17_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr19_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr19_window_1 :
## character(0)

5.2 5th percentile of the extreme H_e values